{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import jieba"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 结巴分词的三种模式\n",
    "### 1 精确模式（默认模式）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = \"小华毕业于中国科学院大学，后赴美深造。\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /tmp/jieba.cache\n",
      "Loading model cost 1.037 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['小华', '毕业', '于', '中国科学院', '大学', '，', '后', '赴美', '深造', '。']\n"
     ]
    }
   ],
   "source": [
    "print(list(jieba.cut(s)))  # 默认模式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['小', '华', '毕业', '于', '中国科学院', '大学', '，', '后', '赴美', '深造', '。']\n"
     ]
    }
   ],
   "source": [
    "print(list(jieba.cut(s, HMM=False)))  # 默认模式下关闭 HMM（隐马尔可夫模型）：“小华”无法被识别为一个词，被切成单字"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2 全模式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "words = jieba.cut(s, cut_all=True)  # 全模式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['小', '华', '毕业', '于', '中国', '中国科学院', '科学', '科学院', '学院', '大学', '', '', '后', '赴美', '深造', '', '']\n"
     ]
    }
   ],
   "source": [
    "print(list(words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['小', '华', '毕业', '于', '中国', '中国科学院', '科学', '科学院', '学院', '大学', '', '', '后', '赴美', '深造', '', '']\n"
     ]
    }
   ],
   "source": [
    "print(list(jieba.cut(s, cut_all=True, HMM=False)))  # 全模式不使用 HMM，HMM=False 对结果没有影响（输出与上一个单元格相同）"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3 搜索引擎模式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['小华', '毕业', '于', '中国', '科学', '学院', '科学院', '中国科学院', '大学', '，', '后', '赴美', '深造', '。']\n"
     ]
    }
   ],
   "source": [
    "seg_list = jieba.cut_for_search(s) # 搜索引擎模式\n",
    "print(list(seg_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
